import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
# Seed NumPy's global random number generator so random draws are repeatable
# from run to run.
np.random.seed(42)
# Load the classroom actions data; the path is relative to the working directory.
df = pd.read_csv('classroom_actions.csv')
# Preview the first rows (a bare expression is only rendered as the last
# statement of a notebook cell).
df.head()
# Keep only the rows whose `group` column is "control"
control_df = df[df['group'] == 'control']
# Completion rate as the mean of the `completed` column
control_cr = control_df['completed'].mean()
# Same rate as a cross-check: rows with completed == True over all control rows
# (the two agree when `completed` is a boolean column with no missing values)
completed_control_rows = control_df[control_df['completed'] == True]  # noqa: E712
control_cr2 = len(completed_control_rows) / len(control_df)
# Show both versions of the completion rate
control_cr, control_cr2
# Keep only the rows whose `group` column is "experiment"
experiment_df = df[df['group'] == 'experiment']
# Completion rate as the mean of the `completed` column
experiment_ctr = experiment_df['completed'].mean()
# Show the experiment completion rate
experiment_ctr
# Observed statistic: experiment completion rate minus control completion rate
obs_diff = experiment_ctr - control_cr
# Show the observed difference
obs_diff
# Build the sampling distribution of the difference in completion rates
# with bootstrapping: resample the whole dataset with replacement and
# recompute the experiment-minus-control difference for each resample.
diffs = []
for _ in range(10000):
    # Bootstrap sample: same number of rows as df, drawn with replacement
    sample = df.sample(len(df), replace=True)
    sample_control_cr = sample.query('group=="control"').completed.mean()
    sample_experiment_cr = sample.query('group=="experiment"').completed.mean()
    diffs.append(sample_experiment_cr - sample_control_cr)
# Convert to a NumPy array so array methods (std, mean, shape) can be used on it
diffs = np.array(diffs)
# Plot the bootstrapped sampling distribution
plt.hist(diffs)
# Simulate the distribution under the null hypothesis: normal draws centred
# on 0 (no difference) with the spread of the bootstrapped sampling
# distribution, one draw per bootstrap replicate.
null_vals = np.random.normal(0, diffs.std(), diffs.shape[0])
# Plot the null distribution
plt.hist(null_vals)
# Mark the observed statistic. This is obs_diff, the difference measured on
# the actual data, not the mean of the bootstrap replicates.
plt.axvline(obs_diff, color='r')
# p-value: share of null draws greater than the observed difference
(null_vals > obs_diff).mean()